3.0-Normalization and Embedding¶

Inés Sentís

Date of execution

In [1]:
Sys.Date()
2025-01-09

Introduction¶

Normalize data and create embeddings for the each time point fraction

In [2]:
sample <- "41BBneg"

Libraries¶

In [3]:
suppressMessages(suppressWarnings({
library(Seurat)
library(tidyverse)
library(grid)
library(gridExtra)
library(ggplot2)
library(scater) 
library(scran)
}))

Parameters¶

In [4]:
#here::dr_here(show_reason = TRUE)
source(here::here("SCGRES_124_125/sc_analysis/misc/paths.R"))
source(here::here("SCGRES_124_125/sc_analysis/misc/variables.R"))

"{clust}/{plt_dir}" %>%
  glue::glue() %>%
  here::here() %>%
  dir.create(path = .,
           showWarnings = FALSE,
           recursive = TRUE)

"{clust}/{robj_dir}" %>%
  glue::glue() %>%
  here::here() %>%
  dir.create(path = .,
           showWarnings = FALSE,
           recursive = TRUE)

set.seed(0)

Load data¶

In [5]:
seurat_obj <- readRDS(here::here(glue::glue("{qc}/{robj_dir}/clean_combined_object_{sample}.rds")))
In [6]:
var_name <- paste0("comp_", sample)
dcomp <- get(var_name)

Normalization and linear dimensional reduction¶

In [7]:
seurat_obj <- NormalizeData(
  seurat_obj,
  normalization.method = "LogNormalize",
  scale.factor = 1e4
)
In [8]:
sce <- as.SingleCellExperiment(seurat_obj)
sce
class: SingleCellExperiment 
dim: 25556 11297 
metadata(0):
assays(2): counts logcounts
rownames(25556): AL627309.1 AL627309.5 ... AC004556.3 AC007325.4
rowData names(0):
colnames(11297): AAACCTGAGAGTACAT-1 AAACCTGAGATATGCA-1 ...
  TTTGTCATCGCCGTGA-1 TTTGTCATCTGTTGAG-1
colData names(12): orig.ident nCount_RNA ... doublet_pred ident
reducedDimNames(0):
mainExpName: RNA
altExpNames(0):
In [9]:
gene_var <- modelGeneVar(sce)

tops <- gene_var %>% 
    as.data.frame() %>% 
    arrange(desc(total)) %>% 
    head(n=20)

gene_var %>% 
  as.data.frame() %>% 
  ggplot(aes(mean, total)) +
  geom_point() +
  geom_line(aes(y = tech), colour = "dodgerblue", size = 1) +
  labs(x = "Mean of log-expression", y = "Variance of log-expression")+
  geom_text(data=tops, aes(mean,total,label=rownames(tops)))
Warning message:
“Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.”
In [10]:
gene_var %>% 
    as.data.frame() %>% 
    arrange(desc(total)) %>% 
    head(n=20)
A data.frame: 20 × 6
meantotaltechbiop.valueFDR
<dbl><dbl><dbl><dbl><dbl><dbl>
CCL53.22269632.60945850.29043582.3190227 0.000000e+00 0.000000e+00
GNLY0.94230852.53119150.51355912.01763252.568624e-2623.237750e-258
AL138963.41.73977911.33688830.47214060.8647477 9.119723e-59 6.966916e-56
LGALS11.50778671.23237990.50411830.7282616 2.435389e-37 9.302447e-35
GZMA1.79558621.19642150.46177510.7346464 7.492485e-45 3.935116e-42
SELL1.14917391.13178250.52825250.6035300 4.322928e-24 1.197594e-21
NKG70.91501501.12073350.50968090.6110527 2.478304e-26 7.181385e-24
HIST1H1E1.32598461.02563160.52127140.5043602 8.246346e-18 1.718102e-15
IL7R1.88396400.98796110.44414440.5438167 2.211869e-27 6.560142e-25
CD742.08706370.93500740.40503740.5299700 5.442901e-31 1.759173e-28
HOPX0.92276700.88129630.51082110.3704753 8.655446e-11 1.069626e-08
CD8B0.95140170.86882000.51475570.3540643 7.061222e-10 7.672992e-08
CTSW1.44251710.86420030.51093150.3532688 5.799620e-10 6.384647e-08
HIST1H4C1.21712910.84620080.52747760.3187231 5.235735e-08 4.313493e-06
ZNF6830.54868580.81615500.40568510.4104700 2.656749e-19 6.144647e-17
TUBA1B0.62216030.79822420.43602840.3621958 1.322677e-13 2.137481e-11
HIST1H1D0.89544630.79560850.50664810.2889603 2.584339e-07 1.840429e-05
TUBB1.19285950.79470720.52800850.2666987 4.378871e-06 2.363840e-04
CD8A0.97042810.79236610.51708800.2752781 1.395038e-06 8.433793e-05
KLRK10.96991190.78239970.51702760.2653721 3.127099e-06 1.767582e-04
In [11]:
hvgs <- getTopHVGs(gene_var,fdr.threshold = 0.05)
length(hvgs)
1168
In [12]:
hvgs <- hvgs[!grepl("^TR[AB][VJC]", hvgs)]
length(hvgs)
1067
In [13]:
seurat_obj <- seurat_obj %>%
  ScaleData(features=hvgs) %>% 
  RunPCA(features=hvgs)
Centering and scaling data matrix

PC_ 1 
Positive:  LTB, FTL, IL7R, TXNIP, EEF1B2, CD52, S1PR1, CCR7, RIPOR2, SELL 
	   HCST, NOSIP, TCF7, MTRNR2L12, MALAT1, TMSB10, AL138963.4, NELL2, CR1, FCER1G 
	   S100B, IL32, IGLV2-14, CTSW, S100A4, CCL5, IFI44L, GZMK, PDE3B, AIRE 
Negative:  UBE2C, TOP2A, ASPM, CDK1, KIFC1, DLGAP5, GTSE1, RRM2, HJURP, CKAP2L 
	   CDCA8, CDCA2, HMMR, KNL1, BIRC5, CENPF, MKI67, NUSAP1, KIF23, TPX2 
	   KIF15, PLK1, KIF14, CDCA3, CCNB2, CCNA2, KIF2C, ANLN, CDKN3, NCAPG 
PC_ 2 
Positive:  EEF1B2, IL7R, PDE7B, BEX3, SELL, TPM2, TCF7, CCR7, S1PR1, MYL9 
	   CRABP2, ALDH7A1, S100A13, KRT1, TNNT1, PDE3B, PEG3, MDK, H19, MYH8 
	   AC104823.1, PAX3, LINC00271, AL138963.4, S100A1, MT1A, SNCG, STAR, TUBB3, RBMS3 
Negative:  NKG7, CCL5, GNLY, CST7, CCL4, CD74, GZMB, GZMH, HLA-DRB1, HLA-DRA 
	   GZMK, CXCR3, CD8A, HLA-DPA1, ALOX5AP, ZNF683, HOPX, HLA-DQA1, HLA-DPB1, CCL4L2 
	   KLRK1, GZMA, AOAH, OASL, CTSW, CD52, EOMES, HCST, LINC02694, AC243829.4 
PC_ 3 
Positive:  IFITM1, MALAT1, IFITM2, LTB, CD52, SELL, IL7R, RIPOR2, TXNIP, IL32 
	   MTRNR2L12, CD7, S1PR1, CCR7, NOSIP, PDE3B, TCF7, NELL2, LRRN3, CTSW 
	   VIM, ASPM, CKAP2L, HCST, CD96, EEF1B2, KIF15, KNL1, DIAPH3, KIF14 
Negative:  CRABP2, TPM2, S100A13, MYL9, ALDH7A1, TNNT1, PEG3, BEX3, MDK, MYH8 
	   S100A1, MT1A, PAX3, H19, AC104823.1, SNCG, SPARC, MT1M, TUBB3, STAR 
	   TCIM, APOC1, LINC00882, COL1A1, TCF21, PALMD, DMKN, FGFBP1, KCNE5, PITX2 
PC_ 4 
Positive:  IFITM2, CTSW, TMSB10, S100A4, CD52, IFITM1, LTB, HOPX, CD7, FTL 
	   HCST, NOSIP, S100A6, CCL5, SELL, LGALS1, CD8B, LINC02446, XCL1, KLRC4 
	   CD8A, KLRK1, CCR7, SPINK2, ZNF683, CXCR3, S1PR1, FXYD2, EEF1B2, S100B 
Negative:  PDE7B, MALAT1, GZMH, RBMS3, PDE3B, CCL4, EOMES, LINC00271, LINC02694, CCL4L2 
	   AC243829.4, KRT1, TIGIT, CST7, ZEB2, HPGDS, PPP1R14B, RTKN2, CCL3, PLEK 
	   GZMB, MTRNR2L12, AC013652.1, HLA-DQA1, C1orf21, OASL, GZMK, FCRL6, ITGB1, HLA-DPA1 
PC_ 5 
Positive:  GAPDH, IL32, ACTG1, SPINK2, XCL1, CD7, ENO1, DDIT4, CXCR6, IFITM2 
	   KRT1, FTL, RBMS3, LGALS1, IFITM1, PDE7B, KRT86, CD96, XCL2, LDHA 
	   HPGDS, ANXA1, FOXP3, CST7, HIST1H1C, RTKN2, HMGN2, HCST, KLRB1, ALOX5AP 
Negative:  RIPOR2, SELL, S1PR1, CCR7, NOSIP, TMSB10, TXNIP, CD8B, GZMK, C1orf21 
	   CR1, NELL2, LYAR, GZMH, HLA-DPB1, AHNAK, IGLV2-14, IL7R, LTB, EOMES 
	   MALAT1, HLA-DPA1, FGFBP2, CD8A, HLA-DRA, TCF7, PLEK, CCL4, LINC02446, PPP1R14B 

In [14]:
options(repr.plot.width = 8, repr.plot.height = 6)
ElbowPlot(seurat_obj, n=50)
In [15]:
options(repr.plot.width = 14, repr.plot.height = 6)
FeaturePlot(object = seurat_obj, reduction = "pca",
        features = c("nCount_RNA","nFeature_RNA"), order=T)
In [16]:
options(repr.plot.width = 16, repr.plot.height = 16, warn=-1,verbose = FALSE)
genes = c("CD3E", "CD3D",
          "CD4", "CD8A","CD8B","FOXP3",
          "TOP2A", "IL7R","SELL")

FeaturePlot(seurat_obj, reduction = "pca", 
            feature=genes, order = F, ncol=3)
In [17]:
options(repr.plot.width = 10, repr.plot.height = 6, warn=-1,verbose = FALSE)
VizDimLoadings(seurat_obj, dims = 1:2, reduction = "pca")

UMAP representation¶

In [18]:
seurat_obj <- RunUMAP(
  seurat_obj,
  dims = dcomp,
  reduction = "pca",
  reduction.name = "umap",
  reduction.key = "UMAP_"
)
15:54:20 UMAP embedding parameters a = 0.9922 b = 1.112

15:54:20 Read 11297 rows and found 15 numeric columns

15:54:20 Using Annoy for neighbor search, n_neighbors = 30

15:54:20 Building Annoy index with metric = cosine, n_trees = 50

0%   10   20   30   40   50   60   70   80   90   100%

[----|----|----|----|----|----|----|----|----|----|

*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
|

15:54:21 Writing NN index file to temp file /scratch_tmp/33937423/RtmpgIQqMo/file3ee442f067cf1

15:54:21 Searching Annoy index using 1 thread, search_k = 3000

15:54:25 Annoy recall = 100%

15:54:26 Commencing smooth kNN distance calibration using 1 thread
 with target n_neighbors = 30

15:54:27 Initializing from normalized Laplacian + noise (using irlba)

15:54:27 Commencing optimization for 200 epochs, with 475194 positive edges

15:54:40 Optimization finished

In [19]:
options(repr.plot.width = 8, repr.plot.height = 6, warn=-1,verbose = FALSE)
DimPlot(
  seurat_obj,
  reduction = "umap",
  pt.size = 0.1
) + ggtitle('UMAP') + 
    theme(plot.title = element_text(hjust = 0.5, size = 12, face = "bold"))

More quality checks on UMAPs¶

Compute Cell-Cycle Scores¶

In [20]:
s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes
seurat_obj <- CellCycleScoring(seurat_obj, s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE)

Plot several features¶

In [21]:
cat_vars <-c("Phase")
con_vars <- c("nCount_RNA", "nFeature_RNA", "pct_mt", "percent.ribo", 
              "doublet_score", "PTPRC", "CD4", "CD8A", "CD8B", "TOP2A")
vars <- c(cat_vars, con_vars)
In [22]:
# compute plots
list_plots <- lapply(vars, function(var){
  if (var %in% cat_vars) {
      p <- DimPlot(seurat_obj, reduction = "umap", group.by=var)
  } else {
      p <- FeaturePlot(seurat_obj, reduction = "umap", feature=var, order = TRUE)
  }
  return(p)
})
In [23]:
options(repr.plot.width = 16, repr.plot.height = 12, warn=-1,verbose = FALSE)
# show plots
cp <- cowplot::plot_grid(plotlist = list_plots,
                   align = "hv",
                   axis = "trbl",
                   ncol = 4,
                   nrow = 3)
cp
In [24]:
options(repr.plot.width = 15, repr.plot.height = 22, warn=-1,verbose = FALSE)
genes = c("PTPRC","SMARCB1","TOP2A", "MKI67","CD3E", "CD3D",
           "CD4", "CD8A","CD8B", "TRDC","TRGV1","TRGV2", "FOXP3")

FeaturePlot(seurat_obj, reduction = "umap", 
            feature=genes, order = T, ncol=3)

Save¶

In [25]:
saveRDS(seurat_obj, here::here(glue::glue("{clust}/{robj_dir}/dimred_combined_object_{sample}.rds")))

Session Info¶

In [26]:
sessionInfo()
R version 4.1.1 (2021-08-10)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS/LAPACK: /home/groups/singlecell/isentis/conda_envs/ines_r4.1.1c/lib/libopenblasp-r0.3.24.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=es_ES.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=es_ES.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=es_ES.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats4    grid      stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] scran_1.22.1                scater_1.22.0              
 [3] scuttle_1.4.0               SingleCellExperiment_1.16.0
 [5] SummarizedExperiment_1.24.0 Biobase_2.54.0             
 [7] GenomicRanges_1.46.1        GenomeInfoDb_1.30.1        
 [9] IRanges_2.28.0              S4Vectors_0.32.4           
[11] BiocGenerics_0.40.0         MatrixGenerics_1.6.0       
[13] matrixStats_1.0.0           gridExtra_2.3              
[15] lubridate_1.9.3             forcats_1.0.0              
[17] stringr_1.5.0               dplyr_1.1.3                
[19] purrr_1.0.2                 readr_2.1.4                
[21] tidyr_1.3.0                 tibble_3.2.1               
[23] ggplot2_3.4.4               tidyverse_2.0.0            
[25] SeuratObject_4.1.4          Seurat_4.0.5               

loaded via a namespace (and not attached):
  [1] uuid_1.1-1                plyr_1.8.9               
  [3] igraph_1.3.4              repr_1.1.6               
  [5] lazyeval_0.2.2            sp_2.1-0                 
  [7] splines_4.1.1             BiocParallel_1.28.3      
  [9] listenv_0.9.0             scattermore_1.2          
 [11] digest_0.6.33             htmltools_0.5.6.1        
 [13] viridis_0.6.4             fansi_1.0.5              
 [15] magrittr_2.0.3            ScaledMatrix_1.2.0       
 [17] tensor_1.5                cluster_2.1.4            
 [19] ROCR_1.0-11               limma_3.50.3             
 [21] tzdb_0.4.0                globals_0.16.2           
 [23] timechange_0.2.0          spatstat.sparse_3.0-1    
 [25] colorspace_2.1-0          ggrepel_0.9.4            
 [27] crayon_1.5.2              RCurl_1.98-1.12          
 [29] jsonlite_1.8.7            progressr_0.14.0         
 [31] spatstat.data_3.0-1       survival_3.5-7           
 [33] zoo_1.8-12                glue_1.6.2               
 [35] polyclip_1.10-4           gtable_0.3.4             
 [37] zlibbioc_1.40.0           XVector_0.34.0           
 [39] leiden_0.4.3              DelayedArray_0.20.0      
 [41] BiocSingular_1.10.0       future.apply_1.11.0      
 [43] abind_1.4-5               scales_1.2.1             
 [45] edgeR_3.36.0              spatstat.random_3.1-5    
 [47] miniUI_0.1.1.1            Rcpp_1.0.10              
 [49] viridisLite_0.4.2         xtable_1.8-4             
 [51] dqrng_0.3.1               reticulate_1.34.0        
 [53] spatstat.core_2.4-2       rsvd_1.0.5               
 [55] metapod_1.2.0             htmlwidgets_1.6.2        
 [57] httr_1.4.7                RColorBrewer_1.1-3       
 [59] ellipsis_0.3.2            ica_1.0-3                
 [61] farver_2.1.1              pkgconfig_2.0.3          
 [63] uwot_0.1.16               deldir_1.0-9             
 [65] here_1.0.1                locfit_1.5-9.8           
 [67] utf8_1.2.3                labeling_0.4.3           
 [69] tidyselect_1.2.0          rlang_1.1.1              
 [71] reshape2_1.4.4            later_1.3.1              
 [73] munsell_0.5.0             tools_4.1.1              
 [75] cli_3.6.1                 generics_0.1.3           
 [77] ggridges_0.5.4            evaluate_0.22            
 [79] fastmap_1.1.1             goftest_1.2-3            
 [81] fitdistrplus_1.1-11       RANN_2.6.1               
 [83] sparseMatrixStats_1.6.0   pbapply_1.7-2            
 [85] future_1.33.0             nlme_3.1-162             
 [87] mime_0.12                 compiler_4.1.1           
 [89] beeswarm_0.4.0            plotly_4.10.2            
 [91] png_0.1-8                 spatstat.utils_3.0-3     
 [93] statmod_1.5.0             stringi_1.7.12           
 [95] bluster_1.4.0             lattice_0.21-8           
 [97] IRdisplay_1.1             Matrix_1.6-1.1           
 [99] vctrs_0.6.4               pillar_1.9.0             
[101] lifecycle_1.0.3           spatstat.geom_3.2-5      
[103] lmtest_0.9-40             BiocNeighbors_1.12.0     
[105] RcppAnnoy_0.0.21          data.table_1.14.8        
[107] cowplot_1.1.1             bitops_1.0-7             
[109] irlba_2.3.5.1             httpuv_1.6.11            
[111] patchwork_1.1.3           R6_2.5.1                 
[113] promises_1.2.1            KernSmooth_2.23-22       
[115] vipor_0.4.7               parallelly_1.36.0        
[117] codetools_0.2-19          MASS_7.3-60              
[119] rprojroot_2.0.3           withr_2.5.1              
[121] sctransform_0.4.0         GenomeInfoDbData_1.2.7   
[123] mgcv_1.8-42               parallel_4.1.1           
[125] hms_1.1.3                 beachmat_2.10.0          
[127] rpart_4.1.19              IRkernel_1.3.2.9000      
[129] DelayedMatrixStats_1.16.0 Cairo_1.6-2              
[131] Rtsne_0.16                pbdZMQ_0.3-10            
[133] shiny_1.7.5.1             base64enc_0.1-3          
[135] ggbeeswarm_0.7.2